home *** CD-ROM | disk | FTP | other *** search
- ###############################################################################
- ###############################################################################
- # HtmlParser.tcl
- ###############################################################################
- ###############################################################################
- # In this file are implemented the procedures used to parse Html file links.
- ###############################################################################
- ###############################################################################
- # Copyright 2000-2004 AndrΘs Garcφa Garcφa -- fandom@retemail.es
- # Distributed under the terms of the GPL v2
- ###############################################################################
- ###############################################################################
- namespace eval HtmlParser {
-
- ###############################################################################
- # SetEntities
- # Initializes the arrays with the translation for Html entities, something
- # like 'entity(lt)==>'
- ###############################################################################
- proc SetEntities {} {
- variable entities
-
- set entities(quot) \"
- set entities(amp) \\&
- set entities(lt) <
- set entities(gt) >
- set entities(nbsp) {}
- set entities(iexcl) í
- set entities(cent) ó
- set entities(pound) ú
- set entities(curren) ñ
- set entities(yen) Ñ
- set entities(brvbar) \|
- set entities(sect) º
- set entities(uml) ¿
- set entities(copy) ⌐
- set entities(ordf) ¬
- set entities(laquo) ½
- set entities(not) ¼
- set entities(shy) ¡
- set entities(reg) «
- set entities(macr) »
- set entities(deg) ░
- set entities(plusmn) ▒
- set entities(sup2) ▓
- set entities(sup3) │
- set entities(acute) ┤
- set entities(micro) ╡
- set entities(para) ╢
- set entities(middot) ╖
- set entities(cedil) ╕
- set entities(sup1) ╣
- set entities(ordm) ║
- set entities(raquo) ╗
- set entities(frac14) ╝
- set entities(frac12) ╜
- set entities(frac34) ╛
- set entities(iquest) ┐
- set entities(ntilde) ±
- set entities(Agrave) └
- set entities(Aacute) ┴
- set entities(Acirc) ┬
- set entities(Atilde) ├
- set entities(Auml) ─
- set entities(Aring) ┼
- set entities(AElig) ╞
- set entities(Ccedil) ╟
- set entities(Egrave) ╚
- set entities(Eacute) ╔
- set entities(Ecirc) ╩
- set entities(Euml) ╦
- set entities(Igrave) ╠
- set entities(Iacute) ═
- set entities(Icirc) ╬
- set entities(Iuml) ╧
- set entities(ETH) ╨
- set entities(Ntilde) ╤
- set entities(Ograve) ╥
- set entities(Oacute) ╙
- set entities(Ocirc) ╘
- set entities(Otilde) ╒
- set entities(Ouml) ╓
- set entities(times) ╫
- set entities(Oslash) ╪
- set entities(Ugrave) ┘
- set entities(Uacute) ┌
- set entities(Ucirc) █
- set entities(Uuml) ▄
- set entities(Yacute) ▌
- set entities(THORN) ▐
- set entities(szlig) ▀
- set entities(agrave) α
- set entities(aacute) ß
- set entities(acirc) Γ
- set entities(atilde) π
- set entities(auml) Σ
- set entities(aring) σ
- set entities(aelig) µ
- set entities(ccedil) τ
- set entities(egrave) Φ
- set entities(eacute) Θ
- set entities(ecirc) Ω
- set entities(euml) δ
- set entities(igrave) ∞
- set entities(iacute) φ
- set entities(icirc) ε
- set entities(iuml) ∩
- set entities(eth) ≡
- set entities(ntilde) ±
- set entities(ograve) ≥
- set entities(oacute) ≤
- set entities(ocirc) ⌠
- set entities(otilde) ⌡
- set entities(ouml) ÷
- set entities(divide) ≈
- set entities(oslash) °
- set entities(ugrave) ∙
- set entities(uacute) ·
- set entities(ucirc) √
- set entities(uuml) ⁿ
- set entities(yacute) ²
- set entities(thorn) ■
- set entities(yuml)
-
- return
- }
-
- ###############################################################################
- # ShowLinks
- # Show the links found in the last preprocessed page, it's only good for
- # debugging.
- ###############################################################################
- proc ShowLinks {} {
- variable nLinks
- variable links
-
- for {set i 1} {$i<$nLinks} {incr i} {
- set description [TidyDescription $links($i,descrip) $links($i,url)]
- if {[info exists links($i,type)]} {
- puts "\n$i: $links($i,file) - Bajar: $links($i,ok)"
- puts "$description - $links($i,type)"
- puts "$links($i,url)"
- } else {
- puts "\n$i: $links($i,file) - Bajar: $links($i,ok)"
- puts "$description"
- puts "$links($i,url)"
- }
- }
- return
- }
-
- ###############################################################################
- # ParseUrl
- # Given an url 'ParseUrl' will split it in its parts: protocol, domain,
- # directory and filename
- #
- # Parameter
- # The url to be parsed,
- #
- # Returns
- # A list with the url split as mentioned above or '1' if the url couldn't be
- # parsed.
- ###############################################################################
- proc ParseUrl {url} {
-
- if {[regexp -nocase \
- {(([^:]*)(?:://))?([^/]+)(((?:~[^/]*)?(?:[^\?]*))(?:/)([^#]*))?} \
- $url nada nada protocol domain nada dir fileName]} {
- if {$protocol==""} {
- set protocol http
- }
- return [list $protocol $domain $dir $fileName]
- }
-
- return 1
- }
-
- ###############################################################################
- # TidyDir
- # Takes things like ".." and "." from the absolute path.
- #
- # Parameter:
- # File path.
- #
- # Returns:
- # The tidied file path.
- ###############################################################################
- proc TidyDir {path} {
- if {[regexp {\.$} $path]} {
- append path /
- }
- for {set a 1 ; set b 1} {($a>0)||($b>0)} {} {
- set a [regsub -all {/\./} $path {/} path]
- set b [regsub -all {([^./]+/\.\./)} $path {} path]
- }
- for {} {[regsub {^/\.\.} $path {} path]} {} {}
-
- return $path
- }
-
- ###############################################################################
- # RemoveEntities
- # Given a link or a link description, this procecedure subtitutes the
- # Html character entities for the real thing, for example '&' gets
- # changed to '&'.
- #
- # Parameter
- # The string to process.
- #
- # Returns
- # The string processed.
- ##############################################################################
- proc RemoveEntities {string} {
- variable entities
-
- while {[regexp {(?:&)([^ ;]+)(;)?} $string old entity]} {
- regsub {#} $entity {} entity
- # Eventually this should be replaced with "string is number"
- if {[regexp {^[0-9]+$} $entity]} {
- if {[catch {format %c $entity} new]} {
- break
- }
- regsub -all {([\\])} $new {\\\1} new
- } else {
- if {[catch {set ::HtmlParser::entities($entity)} new]} {
- break
- }
- }
- if {$new=="&"} {
- set new "\\&"
- }
- regsub -all $old $string $new string
- }
- return $string
- }
-
- ###############################################################################
- # TidyLinks
- # Removes Html character entities from the links. It seems that if a
- # file is, for example, called 'me&you.jpg' some webmasters or Html editors
- # will put 'me&you.jpg' in the link.
- #
- # Side efects
- # The links in 'links' will contain no character entities
- ################################################################################
- proc TidyLinks {} {
- variable nLinks
- variable links
-
- for {set i 1} {$i<$nLinks} {incr i} {
- set links($i,file) [RemoveEntities $links($i,file)]
- }
- return
- }
-
- ###############################################################################
- # TidyDescription
- # Translates for human eyes the description of the links.
- #
- # Parameters:
- # description: The description to be translated.
- # url: The url for the translation
- #
- # Returns:
- # The description translated.
- ###############################################################################
- proc TidyDescription {description url} {
- global labelDialogs
- variable entities
- variable nLinks
- variable links
-
- if {[regexp {^[\s]*$} $description]} {
- return $url
- }
- if {[regexp -nocase {(<img)} $description]} {
- regsub -all {<.*?>} $description {} tmp
- if {![regexp {^\s*$} $tmp]} {
- set description $tmp
- } else {
- set description [GetFileName $description alt]
- if {$description==1} {
- set description $url
- }
- set description "$labelDialogs(linkImage): $description"
- }
- }
- regsub -all {<.*?>} $description {} description
-
- set description [RemoveEntities $description]
- regsub -all {\s+} $description { } description
- regsub {^\s} $description {} description
-
- if {$description==""} {
- return $url
- }
- return $description
- }
-
- ###############################################################################
- # ChangeEncoding
- # Changes the encoding in which the description of the links are written
- #
- # Parameters:
- # newEncoding: The encoding to use.
- ###############################################################################
- proc ChangeEncoding {newEncoding} {
- variable nLinks
- variable links
- global dirGetleft
-
- for {set i 1} {$i<$nLinks} {incr i} {
- set links($i,descrip) \
- [encoding convertfrom "$newEncoding" $links($i,descrip)]
- }
-
- return
- }
-
- ###############################################################################
- # GetFileName
- # Extrac the filename of the link, or the description in the 'alt' field
- # from whatever it gets send.
- #
- # Parameter
- # tag: the string to process.
- # type: what we are looking for: "href", "src" or "alt".
- #
- # Returns:
- # The filename or '1' if none was found.
- ###############################################################################
- proc GetFileName {tag type} {
-
- regsub -all {\s*=\s*} $tag {=} tag
- if {[regexp -nocase -expanded [subst -nocommand {
- (?:$type=)
- (?:(?:(?:\")([^\"]+))| # Filename between ""
- (?:(?:\')([^\']+))| # Between ''
- ([^\ \"'>]+)) # No delimiter
- }] $tag nada a b c]} {
- if {$a!=""} {
- set fileName $a
- } elseif {$b!=""} {
- set fileName $b
- } elseif {$c!=""} {
- set fileName $c
- }
- if {[regexp {^(&)(.*)(;)$} $fileName]} {
- return 1
- }
- if {[regexp -nocase {^mailto:|^news:|^javascript:} $fileName]} {
- return 1
- }
- # Workaround for Webmasters that don't know the directory separator
- # is a single / or feel like using the windows one \.
- regsub -all {([^:])(//)|(\\)} $fileName {\1/} fileName
- # It seems you can write a link as //site.com/file.html
- regsub {^//} $fileName {http://} fileName
- # The following is due to Javascript variables.
- return $fileName
- }
- # If the link happens to be written like: <a href="">index.html</a>
- if {[regexp -nocase [subst -nocommand {(?:$type=)((\"\")|(\'\'))}] $tag]} {
- return ""
- }
- return 1
- }
-
- ###############################################################################
- # CompleteString
- # Reads from the channel 'leer' until the 'cosa' string includes the
- # substring passed as a parameter.
- #
- # Parameters:
- # cadena: name of the variable with the string to complete.
- # leer: channel to read from.
- # pattern: substring to look for in the channel.
- #
- # Returns:
- # - '0': No errors.
- # - '1': The string could not be completed.
- #
- # Side efects:
- # 'cosa' is completed.
- ###############################################################################
- proc CompleteString {cadena leer pattern} {
-
- upvar $cadena cosa
- while {![regexp -nocase [subst -nocommand {$pattern}] $cosa]} {
- append cosa [read $leer 20]
- if {[eof $leer]} {
- return 1
- }
- }
- return 0
- }
-
- ###############################################################################
- # Parsing
- # Reads the Web page passed as a parameter and proccess it to extract
- # all links.
- #
- # Parameters:
- # file: File which contains the page to process.
- # referer: The referer link for the page.
- # level: The level in which we found the file to parse.
- #
- # Returns:
- # - '0': No errors.
- # - '1': Couldn't open file.
- #
- # Side efects:
- # 'nLinks': number of links plus one.
- # 'links' : keeps all the info about the links.
- ###############################################################################
- proc Parsing {file referer level} {
- global labelDialogs labelTitles getleftOptions
- variable nLinks
- variable links
- variable pageEncoding
- variable baseTag
-
- if {[string match $file ""]} return
- set nLinks 1
- catch {unset links}
-
- if {[catch {open $file r} leer]} {
- return 1
- }
- catch {unset linkType}
- set pageEncoding ""
- for {set cosa ""; set thumbnailNext 0 ; set newBase "" ; set baseTag ""} \
- {(![eof $leer]) || ([string compare $cosa ""])} {} {
- if {$getleftOptions(pauseNow)==1} {
- tkwait variable getleftOptions(pauseNow)
- }
- if {![regexp -nocase {((<)(a|b|l|i|f|s|m|t)(.*))|(<$)} $cosa cosa]} {
- set cosa [read $leer 50]
- continue
- }
- set result [CompleteString cosa $leer ">"]
- if {$result==1} break
- regexp {(?:<)([^>]*)(?:>)(.*)} $cosa nada tag cosa
-
- set lowerTag [string tolower $tag]
-
- set fileName 1
- switch -regexp $lowerTag {
- "^area" {
- if {[set fileName [GetFileName $tag href]]!=1} {
- if {[set descrip [GetFileName $tag alt]]!=1} {
- set links($nLinks,descrip) $descrip
- } else {
- set links($nLinks,descrip) "$labelDialogs(map)"
- }
- }
- }
- "^a.+href" {
- if {[set fileName [GetFileName $tag href]]!=1} {
- if {[regexp -nocase {(javascript)(.*)(\()} $fileName]} continue
- # It so happens you can skip closing the link, browsers
- # consider them closed if you open another one.
- set result [CompleteString cosa $leer {(<)((/a)|(a))}]
- if {$result==1} break
- regexp -nocase {(.*?)(?:(<)((/a)|(a)))} $cosa nada descripcion
- set links($nLinks,descrip) $descripcion
- if {[string match $descripcion ""]} {
- set fileName 1
- }
- if {[regexp -nocase {<img[^>]* src} $descripcion]} {
- set thumbnailNext 1
- } else {
- regexp {(.*?)(<)} $descripcion descripcion
- }
- }
- }
- "^img" {
- if {[set fileName [GetFileName $tag src]]!=1} {
- if {[set descrip [GetFileName $tag alt]]==1} {
- set descrip $fileName
- }
- set links($nLinks,descrip) "$labelDialogs(image): $descrip"
- if {$thumbnailNext==0} {
- set links($nLinks,type) image
- } else {
- set links($nLinks,type) thumb
- set thumbnailNext 0
- }
- }
- }
- ^script {
- if {![regexp -nocase {/script>} $cosa]} {
- set result [CompleteString cosa $leer "/script>"]
- if {$result==1} break
- }
- regexp -nocase {(?:/script>)(.*)} $cosa nada cosa
- if {[set fileName [GetFileName $tag src]]!=1} {
- set links($nLinks,descrip) "Script: $fileName"
- }
- }
- "^frame" {
- if {[set fileName [GetFileName $tag src]]!=1} {
- set links($nLinks,descrip) "$labelDialogs(frame): $fileName"
- }
- }
- "^base" {
- if {[set fileName [GetFileName $tag href]]!=1} {
- set baseTag $tag
- set newBase [CompleteUrl $fileName $referer ""]
- set fileName 1
- }
- }
- "^link.+href" {
- if {[set fileName [GetFileName $tag href]]!=1} {
- set links($nLinks,descrip) "$labelDialogs(css)"
- }
- }
- ^meta {
- if {![regexp {(?:meta *)(?:charset=)(?:\"|')?([^\"' ]*)} $tag \
- nada pageEncoding]} {
- if {[set fileName [GetFileName $tag url]]!=1} {
- set links($nLinks,descrip) \
- "$labelDialogs(relocation): $fileName"
- }
- }
- }
- ^table|^td|^th|^tr|^layer|^ilayer|^body {
- if {[set fileName [GetFileName $tag background]]!=1} {
- set links($nLinks,descrip) "$labelDialogs(image): $fileName"
- }
- }
-
- }
- if {$fileName!=1} {
- # set newName $fileName
- if {![regexp {^#} $fileName]} {
- set links($nLinks,file) $fileName
- set url [CompleteUrl $fileName $referer $newBase]
- set links($nLinks,url) $url
- incr nLinks
- }
- }
- }
- close $leer
- return 0
- }
-
- ###############################################################################
- # FilterLinks
- # Filters the links extracted from a page according to the rules given.
- #
- # Parameters
- # referer: The url of the page we got the links from.
- # linkArray: Name of the array where the links are stored
- # level: The recursion level in which we found the referer.
- # externalLevel: and the recursion level for links outside de domain.
- #
- # Side effects:
- # 'nLinks' and 'links' are upated to the new, filtered links
- ###############################################################################
- proc FilterLinks {referer linkArray level {externalLevel 0}} {
- global downOptions siteUrl directories urlsDownloaded
-
- upvar #0 $linkArray links
-
- set baseSite $siteUrl(www)
- regexp {(.*)(:)} $siteUrl(www) nada baseSite
-
- for {set i 1} {[info exists links($i,url)]} {incr i} {
- set link $links($i,url)
- set links($i,ok) 1
- if {([regexp {\.ram$} $link])} {
- set links($i,ok) 0
- continue
- }
- if {($downOptions(filter)!="")&&([regexp -nocase "$downOptions(filter)" \
- $links($i,file)])} {
- set links($i,ok) 0
- continue
- }
- if {([regexp {\?} $link])&&($downOptions(cgi)==0)} {
- set links($i,ok) 0
- continue
- }
- if {[info exists urlsDownloaded($link)]} {
- set links($i,ok) 0
- continue
- }
- if {[regexp {^ftp:.*/$} $link]} {
- set links($i,ok) 0
- continue
- }
- if {[regexp -nocase {^https://} $link]} {
- set links($i,ok) 0
- continue
- }
- set parsedUrl [ParseUrl $link]
- set protocol [lindex $parsedUrl 0]
- set direccion_www [lindex $parsedUrl 1]
- set directory [lindex $parsedUrl 2]
- if {![info exists direccion_www]} {
- set links($i,ok) 0 ; # Maybe something should be put in the error \
- log about this.
- continue
- }
- # www.domain.com and www.domain.com:8080 will be considered the
- # same site.
- set linkSite $direccion_www
- regexp {(.*)(:)} $direccion_www nada linkSite
- if {[string compare [string tolower $baseSite] \
- [string tolower $linkSite]]} {
- if {$downOptions(exLevels)<=$externalLevel} {
- set links($i,ok) 0
- continue
- }
- set externalLink 1
- } else {
- set externalLink 0
- }
- if {($downOptions(dir)==0)&&($siteUrl(base)!="")&&($externalLink==0)} {
- if {![regexp -nocase "^$siteUrl(base)" $directory]} {
- set links($i,ok) 0
- continue
- }
- }
- if {($downOptions(levels)!=-1)&&($downOptions(levels)<=$level)} {
- set links($i,ok) 0
- continue
- }
-
- set remove 0
-
- catch {
- if {($links([expr {$i+1}],type)=="thumb")&&($downOptions(images)==1)} {
- if {[regexp -nocase {(gif$)|(jpg$)|(jpeg$)|(bmp$)|(xbm$)|(tiff$)|(png$)}\
- $links($i,file)]} {
- set remove 1
- }
- }
-
- }
- catch {
- if {($links($i,type)=="thumb")&&($downOptions(images)==2)} {
- set remove 1
- }
- }
- if {$remove==1} {
- set links($i,ok) 0
- continue
- }
- }
- return
- }
-
- ###############################################################################
- # CompleteUrl
- # Given a link, this procedure returns the full Url of that link, for
- # example, a link from a page may be '../index.html', this procedure
- # will return something like 'http://www.algo.es/cosas/index.html'
- #
- # Parameter
- # link: I'll let you guess
- # referer: url of the referrer page for the link
- # newBase: In case the page contains a 'BASE' tag, this will have the
- # url to use as base for the links.
- #
- # Returns
- # The url
- ###############################################################################
- proc CompleteUrl {link referer newBase} {
- global siteUrl
-
- set link [RemoveEntities $link]
- if {[regexp {://} $link]} {
- # cgi links may have a http:// and still be relative
- if {![regexp {(\?)(.*)(://)} $link]} {
- return $link
- }
- }
-
- if {$newBase==""} {
- set parsedUrl [ParseUrl $referer]
- } else {
- set parsedUrl [ParseUrl $newBase]
- }
-
- set prot [lindex $parsedUrl 0]
- set domain [lindex $parsedUrl 1]
- set dir [lindex $parsedUrl 2]
-
- if {[regexp {(?::/)([^/].*)} $link nada fileName]} {
- set url $prot://$domain/$fileName
- return $url
- }
- if {[regexp {^/} $link]} {
- set url $prot://$domain$link
- return $url
- }
- set fileName [TidyDir $dir/$link]
- set url "$prot://$domain$fileName"
-
- return $url
- }
-
- SetEntities
-
- }
-